In [10]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# libraries for models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

# metrics evaluation libraries
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve, RocCurveDisplay
In [11]:
project_data = pd.read_csv("data.csv")
project_data = project_data.drop(columns=["id"]) # droping unwanted columns
In [12]:
project_data.head()
Out[12]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst sensor_id sensor_name sensor_data biomarker_name urine_biomarker_value
0 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 ... 0.6656 0.7119 0.2654 0.4601 0.11890 1 NanoSensor1 15.5 Nuclear Matrix Protein 22 (NMP22) 20.3
1 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 ... 0.1866 0.2416 0.1860 0.2750 0.08902 1 NanoSensor1 15.5 Nuclear Matrix Protein 22 (NMP22) 18.9
2 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 ... 0.4245 0.4504 0.2430 0.3613 0.08758 1 NanoSensor1 15.5 Nuclear Matrix Protein 22 (NMP22) 22.1
3 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 ... 0.8663 0.6869 0.2575 0.6638 0.17300 1 NanoSensor1 15.5 Nuclear Matrix Protein 22 (NMP22) 19.5
4 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 ... 0.2050 0.4000 0.1625 0.2364 0.07678 1 NanoSensor1 15.5 Nuclear Matrix Protein 22 (NMP22) 45.2

5 rows × 36 columns

In [13]:
project_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  569 non-null    float64
 15  smoothness_se            569 non-null    float64
 16  compactness_se           569 non-null    float64
 17  concavity_se             569 non-null    float64
 18  concave points_se        569 non-null    float64
 19  symmetry_se              569 non-null    float64
 20  fractal_dimension_se     569 non-null    float64
 21  radius_worst             569 non-null    float64
 22  texture_worst            569 non-null    float64
 23  perimeter_worst          569 non-null    float64
 24  area_worst               569 non-null    float64
 25  smoothness_worst         569 non-null    float64
 26  compactness_worst        569 non-null    float64
 27  concavity_worst          569 non-null    float64
 28  concave points_worst     569 non-null    float64
 29  symmetry_worst           569 non-null    float64
 30  fractal_dimension_worst  569 non-null    float64
 31  sensor_id                569 non-null    int64  
 32  sensor_name              569 non-null    object 
 33  sensor_data              569 non-null    float64
 34  biomarker_name           569 non-null    object 
 35  urine_biomarker_value    569 non-null    float64
dtypes: float64(32), int64(1), object(3)
memory usage: 160.2+ KB
In [14]:
project_data.describe()
Out[14]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst sensor_id sensor_data urine_biomarker_value
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 2.367311 36.987346 29.605800
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 0.847773 21.069023 13.561382
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 1.000000 3.800000 4.200000
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 2.000000 15.500000 19.500000
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 3.000000 42.200000 31.400000
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 3.000000 53.700000 41.300000
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 3.000000 62.400000 55.600000

8 rows × 33 columns

In [15]:
project_data.shape
Out[15]:
(569, 36)
In [7]:
project_data.columns
Out[7]:
Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_name',
       'sensor_data', 'biomarker_name', 'urine_biomarker_value'],
      dtype='object')
In [8]:
project_data.isna().sum()
Out[8]:
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
sensor_id                  0
sensor_name                0
sensor_data                0
biomarker_name             0
urine_biomarker_value      0
dtype: int64
In [9]:
plt.figure(figsize=(20,10))
sns.countplot(x=project_data["diagnosis"])
print(project_data["diagnosis"].value_counts())
diagnosis
B    357
M    212
Name: count, dtype: int64
In [12]:
numeric_columns = project_data.select_dtypes(include=['float64', 'int64']).columns
corr = project_data[numeric_columns].corr()

plt.figure(figsize=(20, 10))
sns.heatmap(corr, annot=True, cmap="YlGnBu")
plt.show()
In [13]:
X_train=project_data.drop(columns=["diagnosis"])
y_train=project_data["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)
In [14]:
print('Train dataset shape:',X_train.shape)
print('Test dataset shape', y_train.shape)
Train dataset shape: (455, 35)
Test dataset shape (455,)
In [15]:
numeric_columns = X_train.select_dtypes(exclude='object').columns
print(numeric_columns)
print('*'*100)
categorical_columns = X_train.select_dtypes(include='object').columns
print(categorical_columns)
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
       'urine_biomarker_value'],
      dtype='object')
****************************************************************************************************
Index(['sensor_name', 'biomarker_name'], dtype='object')
In [21]:
numeric_features = Pipeline([
    ('handlingmissingvalues',SimpleImputer(strategy='median')),
    ('scaling',StandardScaler(with_mean=True))
])

print(numeric_features)
print('*'*100)

categorical_features = Pipeline([
    ('handlingmissingvalues',SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder()),
    ('scaling', StandardScaler(with_mean=False))
])

print(categorical_features)

processing = ColumnTransformer([
    ('numeric', numeric_features, numeric_columns),
    ('categorical', categorical_features, categorical_columns)
])

processing
Pipeline(steps=[('handlingmissingvalues', SimpleImputer(strategy='median')),
                ('scaling', StandardScaler())])
****************************************************************************************************
Pipeline(steps=[('handlingmissingvalues',
                 SimpleImputer(strategy='most_frequent')),
                ('encoding', OneHotEncoder()),
                ('scaling', StandardScaler(with_mean=False))])
Out[21]:
ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('handlingmissingvalues',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaling',
                                                  StandardScaler())]),
                                 Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture...
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
       'urine_biomarker_value'],
      dtype='object')),
                                ('categorical',
                                 Pipeline(steps=[('handlingmissingvalues',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoding', OneHotEncoder()),
                                                 ('scaling',
                                                  StandardScaler(with_mean=False))]),
                                 Index(['sensor_name', 'biomarker_name'], dtype='object'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('handlingmissingvalues',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaling',
                                                  StandardScaler())]),
                                 Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture...
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
       'urine_biomarker_value'],
      dtype='object')),
                                ('categorical',
                                 Pipeline(steps=[('handlingmissingvalues',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoding', OneHotEncoder()),
                                                 ('scaling',
                                                  StandardScaler(with_mean=False))]),
                                 Index(['sensor_name', 'biomarker_name'], dtype='object'))])
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
       'urine_biomarker_value'],
      dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
Index(['sensor_name', 'biomarker_name'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder()
StandardScaler(with_mean=False)
In [ ]:
def prepare_confusion_matrix(algo, model):
    print(algo)
    plt.figure(figsize=(12,8))
    pred = model.predict(X_test)
    cm = confusion_matrix(y_test, pred)
    ax= plt.subplot()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax)
    plt.show()
    
    # labels, title and ticks
    ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
    ax.set_title('Confusion Matrix'); 
In [ ]:
def prepare_classification_report(algo, model):
    print(algo+' Report :')
    pred = model.predict(X_test)
    print(classification_report(y_test, pred))
In [ ]:
def prepare_roc_curve(algo, model):
    print(algo)
    y_pred_proba = model.predict_proba(X_test)[::,1]
    fpr, tpr, thresholds = roc_curve(y_test,  y_pred_proba)
    roc_auc = auc(fpr, tpr)
    curve = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
    curve.plot()
    plt.show()
In [ ]:
algorithms = [('bagging classifier', BaggingClassifier()), 
              ('KNN classifier', KNeighborsClassifier()), 
              ('Random Forest calssifier', RandomForestClassifier()), 
              ('Adaboost classifier', AdaBoostClassifier()), 
              ('Gradientboot classifier',GradientBoostingClassifier()),
              ('MLP', MLPClassifier())
             ]

trained_models = []
model_and_score = {}

for index, tup in enumerate(algorithms):
    model = prepare_model(tup[1])
    model_and_score[tup[0]] = str(model.score(X_train,y_train)*100)+"%"
    trained_models.append((tup[0],model))
In [ ]:
print(model_and_score)
In [29]:
print(model_and_score)
{'bagging classifier': '98.24175824175823%', 'KNN classifier': '94.28571428571428%', 'Random Forest calssifier': '100.0%', 'Adaboost classifier': '97.8021978021978%', 'Gradientboot classifier': '100.0%', 'MLP': '94.5054945054945%'}
In [32]:
for index, tup in enumerate(trained_models):
    prepare_confusion_matrix(tup[0], tup[1])
bagging classifier
KNN classifier
Random Forest calssifier
Adaboost classifier
Gradientboot classifier
MLP
In [33]:
for index, tup in enumerate(trained_models):
    prepare_classification_report(tup[0], tup[1])
    print("\n")
bagging classifier Report :
              precision    recall  f1-score   support

           B       0.95      0.93      0.94        81
           M       0.83      0.88      0.85        33

    accuracy                           0.91       114
   macro avg       0.89      0.90      0.90       114
weighted avg       0.91      0.91      0.91       114



KNN classifier Report :
              precision    recall  f1-score   support

           B       0.96      0.96      0.96        81
           M       0.91      0.91      0.91        33

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



Random Forest calssifier Report :
              precision    recall  f1-score   support

           B       0.99      0.95      0.97        81
           M       0.89      0.97      0.93        33

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



Adaboost classifier Report :
              precision    recall  f1-score   support

           B       0.99      0.94      0.96        81
           M       0.86      0.97      0.91        33

    accuracy                           0.95       114
   macro avg       0.93      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114



Gradientboot classifier Report :
              precision    recall  f1-score   support

           B       0.97      0.95      0.96        81
           M       0.89      0.94      0.91        33

    accuracy                           0.95       114
   macro avg       0.93      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114



MLP Report :
              precision    recall  f1-score   support

           B       0.99      0.95      0.97        81
           M       0.89      0.97      0.93        33

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [34]:
le = LabelEncoder()
project_data['diagnosis_encoded'] = le.fit_transform(project_data['diagnosis'])

# Plot KDEs for numeric features by diagnosis
for column in numeric_columns:
    plt.figure(figsize=(12, 8))
    sns.kdeplot(data=project_data, x=column, hue='diagnosis', palette="crest", fill=True)
    plt.title(f'Distribution of {column} by Diagnosis')
    plt.show()

# Plotting Pie Chart for Diagnosis
plt.figure(figsize=(8, 8))
plt.pie(project_data['diagnosis'].value_counts(), labels=project_data['diagnosis'].value_counts().index, autopct='%1.1f%%', colors=['lightcoral', 'skyblue'])
plt.title('Distribution of Diagnosis')
plt.show()
In [47]:
print(len(X_train), len(Y_train_encoded))
272 272
In [58]:
df=pd.read_csv('data.csv')
In [16]:
df.head(10)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 df.head(10)

NameError: name 'df' is not defined
In [60]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  sensor_id                569 non-null    int64  
 33  sensor_name              569 non-null    object 
 34  sensor_data              569 non-null    float64
 35  biomarker_name           569 non-null    object 
 36  urine_biomarker_value    569 non-null    float64
dtypes: float64(32), int64(2), object(3)
memory usage: 164.6+ KB
In [61]:
df.isna().sum()
Out[61]:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
sensor_id                  0
sensor_name                0
sensor_data                0
biomarker_name             0
urine_biomarker_value      0
dtype: int64
In [62]:
df.describe()
Out[62]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst sensor_id sensor_data urine_biomarker_value
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 2.367311 36.987346 29.605800
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 0.847773 21.069023 13.561382
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 1.000000 3.800000 4.200000
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 2.000000 15.500000 19.500000
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 3.000000 42.200000 31.400000
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 3.000000 53.700000 41.300000
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 3.000000 62.400000 55.600000

8 rows × 34 columns

In [63]:
df = df.dropna(axis=1)
In [65]:
df.shape
Out[65]:
(569, 37)
In [66]:
df['diagnosis'].value_counts()
Out[66]:
diagnosis
B    357
M    212
Name: count, dtype: int64
In [67]:
import seaborn as sns
In [69]:
lb=LabelEncoder()
In [72]:
plt.figure(figsize=(25,25))
Out[72]:
<Figure size 2500x2500 with 0 Axes>
<Figure size 2500x2500 with 0 Axes>
In [73]:
sns.pairplot(df.iloc[:, 1:5], hue="diagnosis")
Out[73]:
<seaborn.axisgrid.PairGrid at 0x28d8543f010>
In [74]:
X=df.iloc[:, 2:32].values
In [75]:
X
Out[75]:
array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])
In [76]:
y = df.iloc[:, 1].values
In [81]:
y
Out[81]:
array(['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
       'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B',
       'M', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B',
       'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M',
       'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M', 'M',
       'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'M',
       'M', 'B', 'M', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'M',
       'B', 'M', 'M', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'M', 'B', 'M',
       'B', 'B', 'M', 'B', 'M', 'M', 'M', 'M', 'B', 'B', 'M', 'M', 'B',
       'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M',
       'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'B',
       'B', 'B', 'B', 'M', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M',
       'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
       'B', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'M', 'B',
       'B', 'B', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B',
       'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M',
       'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
       'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M',
       'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'B',
       'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'M', 'B', 'B', 'B',
       'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'M', 'M', 'B', 'B',
       'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'B'], dtype=object)
In [78]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
In [79]:
from sklearn.preprocessing import StandardScaler
In [80]:
st  = StandardScaler()
In [82]:
X_train  = st.fit_transform(X_train)
X_test  = st.fit_transform(X_test)
In [83]:
X_train.shape
Out[83]:
(455, 30)
In [84]:
y_train.shape
Out[84]:
(455,)
In [85]:
from sklearn.linear_model import LogisticRegression, LinearRegression
In [86]:
log = LogisticRegression()
In [87]:
log.fit(X_train, y_train)
Out[87]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [88]:
log.score(X_train, y_train)
Out[88]:
0.989010989010989
In [89]:
from sklearn.metrics import accuracy_score, classification_report
In [90]:
accuracy_score(y_test, log.predict(X_test))
Out[90]:
0.956140350877193
In [91]:
print(classification_report(y_test, log.predict(X_test)))
              precision    recall  f1-score   support

           B       0.96      0.97      0.96        67
           M       0.96      0.94      0.95        47

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

In [92]:
import pickle
In [95]:
pickle.dump(log, open("model.pkl", "wb"))
In [2]:
from sklearn.metrics import accuracy_score
In [3]:
accuracy_scor(y.y_pred)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 accuracy_scor(y.y_pred)

NameError: name 'accuracy_scor' is not defined
In [4]:
accuracy_score(y.y_pred)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 accuracy_score(y.y_pred)

NameError: name 'y' is not defined
In [5]:
from sklearn.metrics import accuracy_score

# Assuming y_true is your true labels and y_pred is your predicted labels
y_true = [1, 0, 1, 1, 0]
y_pred = [1, 0, 1, 0, 1]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)

# Print or use the accuracy value as needed
print("Accuracy:", accuracy)
Accuracy: 0.6
In [7]:
df.head(1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 df.head(1)

NameError: name 'df' is not defined
In [8]:
df.head(1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 df.head(1)

NameError: name 'df' is not defined
In [17]:
import pandas as pd

# Assuming you have a DataFrame named df
data = {'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']}
df = pd.DataFrame(data)

# Display the first row using head()
df.head(1)
Out[17]:
Column1 Column2
0 1 A
In [18]:
df.head(5)
Out[18]:
Column1 Column2
0 1 A
1 2 B
2 3 C
In [19]:
df.head(10)
Out[19]:
Column1 Column2
0 1 A
1 2 B
2 3 C
In [21]:
import pickle

with open("data.pkl", "wb") as f:
    pickle.dump(data, f)
In [22]:
import pickle

with open("data.pkl", "rb") as f:
    loaded_data = pickle.load(f)
In [23]:
import pickle

# Load the trained model
with open("your_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[23], line 4
      1 import pickle
      3 # Load the trained model
----> 4 with open("your_model.pkl", "rb") as model_file:
      5     model = pickle.load(model_file)

File ~\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:282, in _modified_open(file, *args, **kwargs)
    275 if file in {0, 1, 2}:
    276     raise ValueError(
    277         f"IPython won't let you open fd={file} by default "
    278         "as it is likely to crash IPython. If you know what you are doing, "
    279         "you can use builtins' open."
    280     )
--> 282 return io_open(file, *args, **kwargs)

FileNotFoundError: [Errno 2] No such file or directory: 'your_model.pkl'
In [25]:
import pickle
import pandas as pd

# Load the trained model
with open("data.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Load new data from the CSV file
new_data = pd.read_csv("data.csv")

# Prepare the new data (assuming you need to preprocess it)
# You might need to perform any preprocessing steps applied to the original training data

# Make predictions
predictions = sensor_data.predict(new_data)

# Print or analyze the predictions
print(predictions)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 15
      9 new_data = pd.read_csv("data.csv")
     11 # Prepare the new data (assuming you need to preprocess it)
     12 # You might need to perform any preprocessing steps applied to the original training data
     13 
     14 # Make predictions
---> 15 predictions = sensor_data.predict(new_data)
     17 # Print or analyze the predictions
     18 print(predictions)

NameError: name 'sensor_data' is not defined
In [26]:
import pickle
import pandas as pd

# Load the trained model
with open("data.pkl", "rb") as model_file:
    model = pickle.load(model_file)
In [27]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming df is your DataFrame
# Replace this with your actual data loading and preprocessing steps
df = pd.read_csv("data.csv")
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Store accuracy and prediction data
model_data = {
    'model': model,
    'accuracy': accuracy,
    'predictions': y_pred,
}

# Dump data into a pickle file
with open("model_and_data.pkl", "wb") as file:
    pickle.dump(model_data, file)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[27], line 18
     16 # Train the machine learning model
     17 model = RandomForestClassifier()
---> 18 model.fit(X_train, y_train)
     20 # Make predictions on the test set
     21 y_pred = model.predict(X_test)

File ~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:345, in BaseForest.fit(self, X, y, sample_weight)
    343 if issparse(y):
    344     raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 345 X, y = self._validate_data(
    346     X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
    347 )
    348 if sample_weight is not None:
    349     sample_weight = _check_sample_weight(sample_weight, X)

File ~\anaconda3\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    563         y = check_array(y, input_name="y", **check_y_params)
    564     else:
--> 565         X, y = check_X_y(X, y, **check_params)
    566     out = X, y
    568 if not no_val_X and check_params.get("ensure_2d", True):

File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1101         estimator_name = _check_estimator_name(estimator)
   1102     raise ValueError(
   1103         f"{estimator_name} requires y to be passed, but the target y is None"
   1104     )
-> 1106 X = check_array(
   1107     X,
   1108     accept_sparse=accept_sparse,
   1109     accept_large_sparse=accept_large_sparse,
   1110     dtype=dtype,
   1111     order=order,
   1112     copy=copy,
   1113     force_all_finite=force_all_finite,
   1114     ensure_2d=ensure_2d,
   1115     allow_nd=allow_nd,
   1116     ensure_min_samples=ensure_min_samples,
   1117     ensure_min_features=ensure_min_features,
   1118     estimator=estimator,
   1119     input_name="X",
   1120 )
   1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1124 check_consistent_length(X, y)

File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    877         array = xp.astype(array, dtype, copy=False)
    878     else:
--> 879         array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
    880 except ComplexWarning as complex_warning:
    881     raise ValueError(
    882         "Complex data not supported\n{}\n".format(array)
    883     ) from complex_warning

File ~\anaconda3\lib\site-packages\sklearn\utils\_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp)
    182     xp, _ = get_namespace(array)
    183 if xp.__name__ in {"numpy", "numpy.array_api"}:
    184     # Use NumPy API to support order
--> 185     array = numpy.asarray(array, order=order, dtype=dtype)
    186     return xp.asarray(array, copy=copy)
    187 else:

File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype)
   2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2070     return np.asarray(self._values, dtype=dtype)

ValueError: could not convert string to float: 'NanoSensor11'
In [28]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load your dataset (replace "your_data.csv" with your actual filename)
df = pd.read_csv("data.csv")

# Identify and separate categorical columns (assuming 'sensor_name' is categorical)
categorical_columns = ['sensor_name']
numeric_columns = df.columns.difference(categorical_columns)

# One-hot encode categorical columns
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_columns = pd.DataFrame(encoder.fit_transform(df[categorical_columns]), columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([df[numeric_columns], encoded_columns], axis=1)

# Split the data into features and target variable
X = df_encoded.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Store accuracy and prediction data
model_data = {
    'model': model,
    'accuracy': accuracy,
    'predictions': y_pred,
}

# Dump data into a pickle file
with open("model_and_data.pkl", "wb") as file:
    pickle.dump(model_data, file)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[28], line 29
     27 # Train the machine learning model
     28 model = RandomForestClassifier()
---> 29 model.fit(X_train, y_train)
     31 # Make predictions on the test set
     32 y_pred = model.predict(X_test)

File ~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:345, in BaseForest.fit(self, X, y, sample_weight)
    343 if issparse(y):
    344     raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 345 X, y = self._validate_data(
    346     X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
    347 )
    348 if sample_weight is not None:
    349     sample_weight = _check_sample_weight(sample_weight, X)

File ~\anaconda3\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    563         y = check_array(y, input_name="y", **check_y_params)
    564     else:
--> 565         X, y = check_X_y(X, y, **check_params)
    566     out = X, y
    568 if not no_val_X and check_params.get("ensure_2d", True):

File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1101         estimator_name = _check_estimator_name(estimator)
   1102     raise ValueError(
   1103         f"{estimator_name} requires y to be passed, but the target y is None"
   1104     )
-> 1106 X = check_array(
   1107     X,
   1108     accept_sparse=accept_sparse,
   1109     accept_large_sparse=accept_large_sparse,
   1110     dtype=dtype,
   1111     order=order,
   1112     copy=copy,
   1113     force_all_finite=force_all_finite,
   1114     ensure_2d=ensure_2d,
   1115     allow_nd=allow_nd,
   1116     ensure_min_samples=ensure_min_samples,
   1117     ensure_min_features=ensure_min_features,
   1118     estimator=estimator,
   1119     input_name="X",
   1120 )
   1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1124 check_consistent_length(X, y)

File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    877         array = xp.astype(array, dtype, copy=False)
    878     else:
--> 879         array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
    880 except ComplexWarning as complex_warning:
    881     raise ValueError(
    882         "Complex data not supported\n{}\n".format(array)
    883     ) from complex_warning

File ~\anaconda3\lib\site-packages\sklearn\utils\_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp)
    182     xp, _ = get_namespace(array)
    183 if xp.__name__ in {"numpy", "numpy.array_api"}:
    184     # Use NumPy API to support order
--> 185     array = numpy.asarray(array, order=order, dtype=dtype)
    186     return xp.asarray(array, copy=copy)
    187 else:

File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype)
   2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2070     return np.asarray(self._values, dtype=dtype)

ValueError: could not convert string to float: 'Aquaporin-1 (AQP1)'
In [29]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load your dataset (replace "your_data.csv" with your actual filename)
df = pd.read_csv("data.csv")

# Identify and separate categorical columns (assuming 'sensor_name' is categorical)
categorical_columns = ['sensor_name']
numeric_columns = df.columns.difference(categorical_columns)

# One-hot encode categorical columns
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_columns = pd.DataFrame(encoder.fit_transform(df[categorical_columns].astype(str)), columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([df[numeric_columns], encoded_columns], axis=1)

# Split the data into features and target variable
X = df_encoded.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Store accuracy and prediction data
model_data = {
    'model': model,
    'accuracy': accuracy,
    'predictions': y_pred,
}

# Dump data into a pickle file
with open("model_and_data.pkl", "wb") as file:
    pickle.dump(model_data, file)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[29], line 29
     27 # Train the machine learning model
     28 model = RandomForestClassifier()
---> 29 model.fit(X_train, y_train)
     31 # Make predictions on the test set
     32 y_pred = model.predict(X_test)

File ~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:345, in BaseForest.fit(self, X, y, sample_weight)
    343 if issparse(y):
    344     raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 345 X, y = self._validate_data(
    346     X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
    347 )
    348 if sample_weight is not None:
    349     sample_weight = _check_sample_weight(sample_weight, X)

File ~\anaconda3\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    563         y = check_array(y, input_name="y", **check_y_params)
    564     else:
--> 565         X, y = check_X_y(X, y, **check_params)
    566     out = X, y
    568 if not no_val_X and check_params.get("ensure_2d", True):

File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1101         estimator_name = _check_estimator_name(estimator)
   1102     raise ValueError(
   1103         f"{estimator_name} requires y to be passed, but the target y is None"
   1104     )
-> 1106 X = check_array(
   1107     X,
   1108     accept_sparse=accept_sparse,
   1109     accept_large_sparse=accept_large_sparse,
   1110     dtype=dtype,
   1111     order=order,
   1112     copy=copy,
   1113     force_all_finite=force_all_finite,
   1114     ensure_2d=ensure_2d,
   1115     allow_nd=allow_nd,
   1116     ensure_min_samples=ensure_min_samples,
   1117     ensure_min_features=ensure_min_features,
   1118     estimator=estimator,
   1119     input_name="X",
   1120 )
   1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1124 check_consistent_length(X, y)

File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    877         array = xp.astype(array, dtype, copy=False)
    878     else:
--> 879         array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
    880 except ComplexWarning as complex_warning:
    881     raise ValueError(
    882         "Complex data not supported\n{}\n".format(array)
    883     ) from complex_warning

File ~\anaconda3\lib\site-packages\sklearn\utils\_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp)
    182     xp, _ = get_namespace(array)
    183 if xp.__name__ in {"numpy", "numpy.array_api"}:
    184     # Use NumPy API to support order
--> 185     array = numpy.asarray(array, order=order, dtype=dtype)
    186     return xp.asarray(array, copy=copy)
    187 else:

File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype)
   2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2070     return np.asarray(self._values, dtype=dtype)

ValueError: could not convert string to float: 'Aquaporin-1 (AQP1)'
In [30]:
import pickle

# Your classification report data
classification_report_data = {
    'accuracy': 0.96,
    'precision': {
        'B': 0.96,
        'M': 0.96,
    },
    'recall': {
        'B': 0.97,
        'M': 0.94,
    },
    'f1-score': {
        'B': 0.96,
        'M': 0.95,
    },
    'support': {
        'B': 67,
        'M': 47,
    },
}

# Set your accuracy threshold
accuracy_threshold = 0.95

# Check if there's cancer and if it exceeds the accuracy threshold
if classification_report_data['accuracy'] > accuracy_threshold:
    print("There's cancer, and the accuracy exceeds the threshold.")
else:
    print("There's cancer, but the accuracy doesn't exceed the threshold.")

# Save to pickle file
with open('classification_report.pkl', 'wb') as file:
    pickle.dump(classification_report_data, file)
There's cancer, and the accuracy exceeds the threshold.
In [31]:
import pickle

# Load the pickle file
with open('C:/Users/karri/classification_report.pkl', 'rb') as file:
    classification_report_data = pickle.load(file)

# Print the contents
print(classification_report_data)
{'accuracy': 0.96, 'precision': {'B': 0.96, 'M': 0.96}, 'recall': {'B': 0.97, 'M': 0.94}, 'f1-score': {'B': 0.96, 'M': 0.95}, 'support': {'B': 67, 'M': 47}}
In [32]:
from flask import Flask, render_template, request
import pickle
import matplotlib.pyplot as plt
from io import BytesIO
import base64

app = Flask(__name__)

# Load the model and threshold from the pickle file
with open('classification_report.pkl', 'rb') as file:
    classification_report_data = pickle.load(file)

# Set your threshold
threshold = 0.95

def generate_plot(prediction):
    # Replace this with your actual logic to generate the plot based on user input
    # For simplicity, we'll create a bar chart with random data
    labels = ['Precision', 'Recall', 'F1-Score']
    values = [0.9, 0.8, 0.85]  # Replace with your actual values

    plt.bar(labels, values)
    plt.title(f'Metrics for {prediction} class')
    plt.xlabel('Metrics')
    plt.ylabel('Values')

    # Save the plot to a BytesIO object
    image_stream = BytesIO()
    plt.savefig(image_stream, format='png')
    image_stream.seek(0)
    plt.close()

    # Convert the BytesIO object to a base64-encoded string
    plot_url = base64.b64encode(image_stream.read()).decode('utf-8')

    return f"data:image/png;base64,{plot_url}"

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    if request.method == 'POST':
        # Get user input from the form
        # Assuming you have form fields like 'feature1', 'feature2', etc.
        feature1 = float(request.form['feature1'])
        feature2 = float(request.form['feature2'])
        # Add more features as needed

        # Make predictions based on the user input
        # Replace this with your actual prediction logic using the loaded model
        # For simplicity, we'll assume a random prediction here
        prediction = 'B'  # Replace with your actual prediction

        # Check if the predicted probability exceeds the threshold
        if classification_report_data['precision'][prediction] > threshold:
            result = f"The model predicts there is cancer ({prediction})."
        else:
            result = f"The model predicts there is no cancer ({prediction})."

        # Generate a plot based on user input
        plot_url = generate_plot(prediction)

   
In [1]:
!pip install flask
Requirement already satisfied: flask in c:\users\karri\anaconda3\lib\site-packages (2.2.2)
Requirement already satisfied: Werkzeug>=2.2.2 in c:\users\karri\anaconda3\lib\site-packages (from flask) (2.2.2)
Requirement already satisfied: click>=8.0 in c:\users\karri\anaconda3\lib\site-packages (from flask) (8.0.4)
Requirement already satisfied: Jinja2>=3.0 in c:\users\karri\anaconda3\lib\site-packages (from flask) (3.1.2)
Requirement already satisfied: itsdangerous>=2.0 in c:\users\karri\anaconda3\lib\site-packages (from flask) (2.0.1)
Requirement already satisfied: colorama in c:\users\karri\anaconda3\lib\site-packages (from click>=8.0->flask) (0.4.6)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\karri\anaconda3\lib\site-packages (from Jinja2>=3.0->flask) (2.1.1)
In [ ]: